Trajectory inference analysis: Slingshot

Loading libraries

suppressPackageStartupMessages({
    # Single cell libraries
    library(Seurat)
    library(scran)
    library(scater)

    # Plotting
    library(rafalib)
    library(cowplot)
    library(rgl)
    library(plotly)
    options(rgl.printRglwidget = TRUE)

    # Sparse Matrix manipulation tools
    library(Matrix)
    library(sparseMatrixStats)

    # Trajectory analysis
    library(slingshot)
    library(tradeSeq)

    # Dimensionality reduction
    library(destiny)
    library(fastICA)
})

# Define some color pallete
pal <- c((scales::hue_pal())(8), RColorBrewer::brewer.pal(9, "Set1"), RColorBrewer::brewer.pal(8,
    "Set2"))
set.seed(1)
pal <- rep(sample(pal, length(pal)), 200)

Nice function to easily draw a graph:

# Add graph to the base R graphics plot
draw_graph <- function(layout, graph, lwd = 0.2, col = "grey") {
    res <- rep(x = 1:(length(graph@p) - 1), times = (graph@p[-1] - graph@p[-length(graph@p)]))
    segments(x0 = layout[graph@i + 1, 1], x1 = layout[res, 1], y0 = layout[graph@i +
        1, 2], y1 = layout[res, 2], lwd = lwd, col = col)
}

Loading data

In order to speed up the computations during the exercises, we will be using a subset of a bone marrow dataset (originally containing about 100K cells). The bone marrow is the source of adult immune cells, and contains virtually all differentiation stages of cell from the imune system which later circulate in the blood to all other organs.

You can download the files we prepared with these commands:

# webpath <-
# 'https://raw.githubusercontent.com/NBISweden/workshop-scRNAseq/new_dataset/labs/data/bone_marrow/'
# file_list <- c('trajectory_seurat.rds', 'trajectory_scran.rds')
# dir.create('./',recursive = T) for(i in file_list){ download.file( url =
# paste0(webpath,i) , destfile = paste0('./',i))}

If you have been using the scran/scater , Seurat or Scanpy pipelines with your own data, you need to reach to the point where can find get:

  • A dimensionality reduction where to perform the trajectory (for example: pca, ica, MNN, harmony, dm, UMAP)
  • The cell clustering information (for example: from louvain, kmeans)
  • A knn/snn graph (this is useful to inspect and sanity-check your trajectories)

Loading the dataset


We already have pre-computed and subseted the dataset (with 6688 cells and 3585 genes) following the anlaysis steps in this course. We then saved the objects, so you can use common tools to open and start to work with them (either in R or Python).

# SEURAT
obj <- readRDS("trajectory_seurat.rds")

# SCRAN
sce <- readRDS("trajectory_scran.rds")

Lets visualise which clusters we have in our dataset:

vars <- c("batches", "dataset", "clusters", "Phase")
pl <- list()

# SEURAT
for (i in vars) {
    pl[[i]] <- DimPlot(obj, group.by = i, label = T) + theme_void() + NoLegend()
}
plot_grid(plotlist = pl)

# SCRAN
for (i in vars) {
    pl[[i]] <- plotReducedDim(sce, "umap", colour_by = i, text_by = i) + theme_void() +
        NoLegend()
}
plot_grid(plotlist = pl)

You can check, for example how many cells are in each cluster:

# SEURAT
table(obj$clusters)
##
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20
## 128  71  20 109  90 160 147 120 160 123 130 132  78  90 150 140  76 141  90  98
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40
## 149  90  10  33  56 154  98  76 125  44  64 150 150 146 150 148 135 128  79  70
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60
## 145  12 134 110 149 140 113   4 132  85   6 126 129  57 129 137 147 127 118 120
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74
## 101  20   8  20  10  12   6  13  20   5  10  10  10  15
# SCRAN
table(sce$clusters)
##
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20
## 128  71  20 109  90 160 147 120 160 123 130 132  78  90 150 140  76 141  90  98
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40
## 149  90  10  33  56 154  98  76 125  44  64 150 150 146 150 148 135 128  79  70
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60
## 145  12 134 110 149 140 113   4 132  85   6 126 129  57 129 137 147 127 118 120
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74
## 101  20   8  20  10  12   6  13  20   5  10  10  10  15

Exploring the data


It is crutial that you performing analysis of a dataset understands what is going on, what are the clusters you see in your data and most importantly How are the clusters related to each other?. Well, let’s explore the data a bit. With the help of this table, write down which cluster numbers in your dataset express these key markers.

Marker Cell Type
Cd34 HSC progenitor
Ms4a1 B cell lineage
Cd3e T cell lineage
Ltf Granulocyte lineage
Cst3 Monocyte lineage
Mcpt8 Mast Cell lineage
Alas2 RBC lineage
Siglech Dendritic cell lineage
C1qc Macrophage cell lineage
Pf4 Megakaryocyte cell lineage
vars <- c("Cd34", "Ms4a1", "Cd3e", "Ltf", "Cst3", "Mcpt8", "Alas2", "Siglech", "C1qc",
    "Pf4")
pl <- list()

# SEURAT
pl <- list(DimPlot(obj, group.by = "clusters", label = T) + theme_void() + NoLegend())
for (i in vars) {
    pl[[i]] <- FeaturePlot(obj, features = i, order = T) + theme_void() + NoLegend()
}
plot_grid(plotlist = pl)

# SCRAN
pl <- list(plotReducedDim(sce, "umap", colour_by = "clusters", text_by = "clusters") +
    theme_void() + NoLegend())
for (i in vars) {
    pl[[i]] <- plotReducedDim(sce, "umap", colour_by = i, by_exprs_values = "logcounts") +
        theme_void() + NoLegend()
}
plot_grid(plotlist = pl)

Another way to better explore your data is look in higher dimensions, to really get a sense for what is right or wrong. As mentioned in the dimensionality reduction exercises, here we ran umap with 3 dimensions (IMPORTANT: the umap needs to be computed to results in exactly 3 dimensions).

Since the steps below are identical to both Seurat and Scran pipelines, we ill extract the matrices from both, so it is clear what is being used where and to remove long lines of code used to get those matrices. We will use them all.

# SEURAT
NORM_COUNTS <- obj@assays$RNA@data
UMAP2 <- obj@reductions$umap@cell.embeddings
UMAP3 <- obj@reductions$umap3d@cell.embeddings
HARMONY <- obj@reductions$harmony_Phase@cell.embeddings
PCA <- obj@reductions$pca@cell.embeddings
PCA_loadings <- obj@reductions$pca@feature.loadings
clustering <- factor(obj$clusters)
KNN <- obj@graphs$knn


# SCRAN
NORM_COUNTS <- logcounts(sce)
UMAP2 <- reducedDim(sce, "umap")
UMAP3 <- reducedDim(sce, "umap3d")
HARMONY <- reducedDim(sce, "harmony_Phase")
PCA <- reducedDim(sce, "pca")
PCA_loadings <- attr(reducedDim(sce, "pca"), "rotation")
clustering <- factor(sce$clusters)
KNN <- reducedDim(sce, "knn")


# Calculate cluster centroids (for plotting the labels later)
mm <- sparse.model.matrix(~0 + factor(clustering))
colnames(mm) <- levels(factor(clustering))
centroids3d <- as.matrix(t(t(UMAP3) %*% mm)/Matrix::colSums(mm))
centroids2d <- as.matrix(t(t(UMAP2) %*% mm)/Matrix::colSums(mm))

Plot in 3D with RGL:

# Plot in 3D with RGL
rgl::open3d()
## null
##    1
points3d(x = UMAP3[, 1], y = UMAP3[, 2], z = UMAP3[, 3], col = pal[factor(clustering)])
text3d(t(centroids3d[, 1:3]), texts = rownames(centroids3d), cex = 1)
try(htmlwidgets::saveWidget(rglwidget(width = 1000, height = 800), selfcontained = T,
    "umap_3d_clustering_rgl.html"), silent = T)
browseURL("umap_3d_clustering_rgl.html")
rgl::close3d()

Or Plot in 3D with Plotly:

df <- data.frame(UMAP3, variable = clustering)
colnames(df)[1:3] <- c("UMAP_1", "UMAP_2", "UMAP_3")
p_State <- plot_ly(df, x = ~UMAP_1, y = ~UMAP_2, z = ~UMAP_3, color = ~variable,
    colors = pal, size = 0.5)
try(htmlwidgets::saveWidget(p_State, selfcontained = T, "umap_3d_clustering_plotly.html"),
    silent = T)
browseURL("umap_3d_clustering_plotly.html")
p_State

Data pre-processing prior trajectory inference

Before we take a dive into trajectory itself, we need to clean up the data a bit, so we can get nice figures later. This dataset is already clean in terms of guality of the cells and so on, but as you probably noticed in the clustering exercise, there is always some outlier cells in the “wrong” part of the plot (as in, far away from cell from the same cluster ).

In reality, this is distotion caused by the graph layout algorithm (UMAP in this case) that tries to force the data to be shown in 2 dimensions. If you check the 3D plots, the data points are in the “correct” position closer to other points in the same clusters (there is not a single misplaced point). Still, in most cases not even 3 dimensions is enough to represent the data correctly.

# calculate the distance from every cell to its neighbors
expected_U2d <- t(t(UMAP2) %*% KNN)/colSums2(KNN)
d <- rowSums((expected_U2d - UMAP2)^2)^(1/2)

# Define a distance cutoff
hist(d, breaks = 400)
cutoff <- mean(d) + 5 * sd(d)
abline(v = (cutoff), col = "red", xpd = F)

cutoff
## [1] 0.7806166
to_keep <- (d < cutoff)

mypar()
plot(UMAP2, type = "n")
draw_graph(layout = UMAP2, graph = KNN)
points(UMAP2, cex = ifelse(!to_keep, 1, 0.3), lwd = ifelse(!to_keep, 2, 0), bg = pal[clustering],
    pch = 21)
text(centroids2d, labels = rownames(centroids2d), cex = 1, font = 2)

In most cases this doesn’t affect any results, but it affects the trajectories. As there are many ways to handle this, but the simplest is just change of the outlier cells to be closer its “cluster-mates”. We only need to do this for the UMAP 2D layout.

new_UMAP2 <- UMAP2
res <- as.matrix(t(t(UMAP2) %*% KNN)/colSums2(KNN))
new_UMAP2[!to_keep, ] <- res[!to_keep, ]

new_centroids2d <- as.matrix(t(t(new_UMAP2) %*% mm)/Matrix::colSums(mm))

And let’s plot the UMAP again:

# Check the umap in 2D
mypar()
plot(new_UMAP2, type = "n")
draw_graph(layout = new_UMAP2, graph = KNN)
points(new_UMAP2, cex = ifelse(!to_keep, 1, 0.3), lwd = ifelse(!to_keep, 2, 0), bg = pal[clustering],
    pch = 21)
text(new_centroids2d, labels = rownames(new_centroids2d), cex = 1, font = 2)

Much better!

Trajectory inference with Slingshot


Until up to this point, the steps above have been somewhat covered in the previous lectures. From now on, we will start using that clustering and data reduction techniques for trajectory inference.

Which dimensionality reduction to use?

Here, the choice upon which dimension to use have a great impact on your results. As explained above and in the dimensionality reduction exercise, using a 2D UMAP to create trajectories is NOT a good idea, it is literally a distorsion of your data! Instead, we should use other multidemnsional representations (more than 2D) for the calculations and only later then visualise in your UMAP 2D.

Here, we can run even some other additional dimensionality reduction methods (ICA and DiffusionMaps) on top of the integrated harmony embedding, where the batch effects were corrected.

Let’s firts explore ICA:

# Computing ICA
ICA_object <- fastICA::fastICA(X = HARMONY, n.comp = 20, method = "C", row.norm = T)
## colstandard
ICA <- ICA_object$S
colnames(ICA) <- paste0("ICA_", 1:ncol(ICA))

mypar(3, 3)
plot(new_UMAP2, pch = 16, col = pal[clustering])
text(new_centroids2d, labels = rownames(new_centroids2d), cex = 1, font = 2)

for (i in 1:8) {
    cc <- t(t(ICA[, c(i * 2 - 1, i * 2)]) %*% mm)/Matrix::colSums(mm)
    plot(ICA[, c(i * 2 - 1, i * 2)], pch = 16, col = pal[clustering])
    text(cc[, 1], cc[, 2], labels = rownames(cc), cex = 1, font = 2)
}

Let’s now explore DM:

# Computing Diffusion Maps
DM_object <- destiny::DiffusionMap(data = HARMONY, k = 20, sigma = "global")
DM <- DM_object@eigenvectors
colnames(DM) <- paste0("DC_", 1:ncol(DM))

mypar(3, 3)
plot(new_UMAP2, pch = 16, col = pal[clustering])
text(new_centroids2d, labels = rownames(new_centroids2d), cex = 1, font = 2)

for (i in 1:8) {
    cc <- t(t(DM[, c(i * 2 - 1, i * 2)]) %*% mm)/Matrix::colSums(mm)
    plot(DM[, c(i * 2 - 1, i * 2)], pch = 16, col = pal[clustering])
    text(cc[, 1], cc[, 2], labels = rownames(cc), cex = 1, font = 2)
}

# DM_object@transitions res <- destiny::DPT(DM_object) destiny::plot.DPT(res)
# res@branch

Defining cell lineages with Slingshot

Let run default Slingshot lineage identification on the 2D UMAP to get a first impression about it (for the sake of simplycity), but you are welcome to explore different embeddings and use more dimensions. The whole process can be done using a single function named slingshot, which is simply a wrapper for the 2 main steps for trajectory inference. The first step of the process is to define the lineages and then fit a curve through the data that defines a trajectory. These steps are break down below for clarity.

# Run Slingshot on UMAP3d
set.seed(1)
lineages <- getLineages(data = new_UMAP2, clusterLabels = clustering)

# Change the reduction (FOR VISUALISATION ONLY, in case you use another
# dimension for calculations)
lineages@reducedDim <- new_UMAP2

# Plot the lineages
mypar(1, 2)
plot(new_UMAP2, col = pal[clustering], cex = 0.5, pch = 16)
lines(lineages, lwd = 2, col = "black", cex = 3)
text(new_centroids2d, labels = rownames(new_centroids2d), cex = 0.8, font = 2, col = "white")

# Check the umap in 2D
plot(new_UMAP2, type = "n")
draw_graph(layout = new_UMAP2, graph = KNN)
points(new_UMAP2, cex = ifelse(!to_keep, 1, 0.3), lwd = ifelse(!to_keep, 2, 0), bg = pal[clustering],
    pch = 21)
text(new_centroids2d, labels = rownames(new_centroids2d), cex = 0.8, font = 2)

Now take a closer look on the object, which lineages you see? Which is the starting cell cluster?

print(lineages)
## class: SlingshotDataSet
##
##  Samples Dimensions
##     6688          2
##
## lineages: 21
## Lineage1: 70  19  17  22  28  14  13  5  2  1  9  46  15  45  35  11  18  36  37  41  72  10  55  59  61  44  60  58  29  8  43  52  47  49  64
## Lineage2: 70  19  17  22  28  14  13  5  2  1  9  46  15  45  35  11  18  36  37  41  72  10  55  59  61  44  60  58  29  8  43  52  47  49  50
## Lineage3: 70  19  17  22  28  14  13  5  2  1  9  46  15  45  35  11  18  36  37  41  72  10  55  59  61  44  60  58  29  8  43  52  47  62
## Lineage4: 70  19  17  22  28  14  13  5  2  1  9  46  15  45  35  11  18  36  37  41  72  10  55  59  61  44  60  58  29  8  43  52  47  69
## Lineage5: 70  19  17  22  28  14  13  5  2  1  9  46  15  45  35  11  18  36  37  41  72  10  55  59  61  44  60  58  29  42  56  66
## Lineage6: 70  19  17  22  28  14  13  5  2  1  9  46  15  45  35  11  18  36  37  41  72  10  55  59  61  44  60  58  29  42  56  74
## Lineage7: 70  19  17  22  28  14  13  5  2  1  9  46  15  45  35  11  18  36  37  26  21  12  20  16  51
## Lineage8: 70  19  17  22  28  14  13  5  2  1  9  46  15  45  35  11  18  36  37  41  72  10  55  38  53
## Lineage9: 70  19  17  22  28  14  13  5  2  1  9  46  15  45  35  11  18  36  37  41  72  10  55  73
## Lineage10: 70  19  17  22  28  14  13  5  2  1  9  46  15  45  35  11  18  36  37  26  21  65
## Lineage11: 70  19  17  22  28  14  13  5  2  1  9  46  15  45  35  11  18  36  37  41  72  33
## Lineage12: 70  19  17  22  28  14  13  5  2  1  9  46  15  45  35  7  32  6  54  25
## Lineage13: 70  19  17  22  28  14  13  5  2  1  9  46  15  45  35  7  32  6  54  68
## Lineage14: 70  19  17  22  28  14  13  5  2  1  9  46  15  45  35  7  32  6  54  71
## Lineage15: 70  19  17  22  28  14  13  5  2  1  9  46  15  45  35  7  32  6  27
## Lineage16: 70  19  17  22  28  14  13  5  2  1  9  46  15  45  35  11  18  34  57
## Lineage17: 70  19  17  22  28  14  13  5  39  40  4  48  63  24
## Lineage18: 70  19  17  22  28  14  13  5  39  40  31  67
## Lineage19: 70  19  17  22  28  14  13  5  23
## Lineage20: 70  19  17  22  28  14  3
## Lineage21: 70  19  30
##
## curves: 0

Here we see one central issue with trajectory analysis: where does the trajectory begin? Without any extra information, this is nearly an impossible task for any T.I. method. We need prior biological information to be able to define where the trajectory starts and where it should end.

First, we need to make sure to identify which cluster is the progenitor cell. In this case, they express the marker CD34.

Marker Cell Type
Cd34 HSC progenitor
Ms4a1 B cell lineage
Cd3e T cell lineage
Ltf Granulocyte lineage
Cst3 Monocyte lineage
Mcpt8 Mast Cell lineage
Alas2 RBC lineage
Siglech Dendritic cell lineage
# SEURAT
plot_grid(plotlist = list(DimPlot(obj, group.by = "clusters", label = T) + theme_void() +
    NoLegend(), FeaturePlot(obj, features = "Cd34", order = T) + theme_void() + NoLegend()))

# SCRAN
plot_grid(plotlist = list(plotReducedDim(sce, "umap", colour_by = "clusters", text_by = "clusters") +
    theme_void() + NoLegend(), plotReducedDim(sce, "umap", colour_by = "Cd34", by_exprs_values = "logcounts") +
    theme_void() + NoLegend()))

Then, we can insert that information on where the trajectory starts on the getLineages function.

# Run Slingshot on UMAP3d
set.seed(1)
lineages <- getLineages(data = new_UMAP2,
                        clusterLabels = clustering,
                        #end.clus = c("4","3","13","9"), # You can also define the ENDS!
                        start.clus = "57")               # efine where to START the trajectories

# Change the reduction (FOR VISUALISATION ONLY, in case you use another dimension for calculations)
lineages@reducedDim <- new_UMAP2


#Plot the lineages
mypar() ; plot(new_UMAP2, col = pal[clustering],  cex=.5,pch = 16)
lines(lineages, lwd = 2, col = 'black', cex=3 )
text(new_centroids2d,labels = rownames(new_centroids2d),cex=0.8,font=2,col = "white")

If you compare this plot with the previous, you will not notice many differences, but let’s check

print(lineages)
## class: SlingshotDataSet
##
##  Samples Dimensions
##     6688          2
##
## lineages: 21
## Lineage1: 57  34  18  36  37  41  72  10  55  59  61  44  60  58  29  8  43  52  47  49  64
## Lineage2: 57  34  18  36  37  41  72  10  55  59  61  44  60  58  29  8  43  52  47  49  50
## Lineage3: 57  34  18  36  37  41  72  10  55  59  61  44  60  58  29  8  43  52  47  62
## Lineage4: 57  34  18  36  37  41  72  10  55  59  61  44  60  58  29  8  43  52  47  69
## Lineage5: 57  34  18  11  35  45  15  46  9  1  2  5  13  14  28  22  17  19  70
## Lineage6: 57  34  18  11  35  45  15  46  9  1  2  5  13  14  28  22  17  19  30
## Lineage7: 57  34  18  11  35  45  15  46  9  1  2  5  39  40  4  48  63  24
## Lineage8: 57  34  18  36  37  41  72  10  55  59  61  44  60  58  29  42  56  66
## Lineage9: 57  34  18  36  37  41  72  10  55  59  61  44  60  58  29  42  56  74
## Lineage10: 57  34  18  11  35  45  15  46  9  1  2  5  39  40  31  67
## Lineage11: 57  34  18  11  35  45  15  46  9  1  2  5  13  14  3
## Lineage12: 57  34  18  11  35  45  15  46  9  1  2  5  23
## Lineage13: 57  34  18  36  37  26  21  12  20  16  51
## Lineage14: 57  34  18  36  37  41  72  10  55  38  53
## Lineage15: 57  34  18  11  35  7  32  6  54  25
## Lineage16: 57  34  18  11  35  7  32  6  54  68
## Lineage17: 57  34  18  11  35  7  32  6  54  71
## Lineage18: 57  34  18  36  37  41  72  10  55  73
## Lineage19: 57  34  18  11  35  7  32  6  27
## Lineage20: 57  34  18  36  37  26  21  65
## Lineage21: 57  34  18  36  37  41  72  33
##
## curves: 0

What changed?

The issue with unconnected clusters

As you have probably noticed, there are many other small clusters in the dataset that do not express those markers. Those are other less abundant cell types, which the intermediate cell states were NOT present (because there were not enought cells captured)! Slingshot tries to fit a model / MST (minimum spanning tree) on the whole data, so for this example, we can look at the KNN graph to help us filter out some cells that can’t estimate trajectories with confidence.

Here, we will also remove some other small clusters just for the sake of simplicity on the steps below. In practice, sometimes it is easier to work with parts of the dataset at a time, in order to explore trajectories in more details.

mypar()
plot(new_UMAP2, col = pal[clustering], cex = 0.5, pch = 16)
draw_graph(layout = new_UMAP2, graph = KNN)
text(new_centroids2d, labels = rownames(new_centroids2d), cex = 1, font = 2)

# Define clusters to filter
sort(table(clustering))
## clustering
##  48  70  51  67  63  23  65  71  72  73  42  66  68  74   3  62  64  69  24  30
##   4   5   6   6   8  10  10  10  10  10  12  12  13  15  20  20  20  20  33  44
##  25  54  31  40   2  17  28  13  39  50   5  14  19  22  20  27  61   4  44  47
##  56  57  64  70  71  76  76  78  79  85  90  90  90  90  98  98 101 109 110 113
##  59   8  60  10  29  52  58   1  38  53  55  11  12  49  43  37  56  16  46  18
## 118 120 120 123 125 126 127 128 128 129 129 130 132 132 134 135 137 140 140 141
##  41  34   7  57  36  21  45  15  32  33  35  26   6   9
## 145 146 147 147 148 149 149 150 150 150 150 154 160 160
clusters_to_remove <- c("71", "65", "73", "66", "56", "74", "42", "69", "62", "40",
    "30", "68", "70", "64", "67", "51", "24", "63", "48", "3", "10", "4", "31", "72",
    "39")
cell_to_keep <- !(clustering %in% clusters_to_remove)



# Filtering clusters
filt_new_UMAP2 <- new_UMAP2[cell_to_keep, ]
filt_UMAP3 <- UMAP3[cell_to_keep, ]
filt_NORM_COUNTS <- NORM_COUNTS[, cell_to_keep]
filt_PCA <- PCA[cell_to_keep, ]
filt_HARMONY <- HARMONY[cell_to_keep, ]
filt_KNN <- KNN[cell_to_keep, cell_to_keep]
filt_clustering <- factor(clustering[cell_to_keep])
filt_new_centroids2d <- as.matrix(new_centroids2d[!(rownames(new_centroids2d) %in%
    clusters_to_remove), ])
filtcentroids3d <- as.matrix(centroids3d[!(rownames(centroids3d) %in% clusters_to_remove),
    ])



# Plot
mypar()
plot(filt_new_UMAP2, type = "n")
draw_graph(layout = filt_new_UMAP2, graph = filt_KNN)
points(filt_new_UMAP2, col = pal[filt_clustering], cex = 0.5, pch = 16)
text(filt_new_centroids2d[, 1:2], labels = rownames(filt_new_centroids2d), cex = 1,
    font = 1)

The issue with very similar undistinguishible clusters

Another issue is related to the clustering resolution used. For trajectories, it is better to use fine-grain resolution to separate intermediate cell types. Again, there are many approaches to that, and here we will use the KNN graph to compute the amount of connections between clusters. If the amount of connections is above a certain threshold, we then merge those clusters. IMPORTANT: The code below is very usefull in many other single cell data analysis ocasions!

# Compute connections between clusters on a graph
filt_mm <- sparse.model.matrix(~0 + factor(filt_clustering))
colnames(filt_mm) <- levels(factor(filt_clustering))
d <- t(filt_KNN %*% filt_mm) %*% filt_mm/(t(t(colSums(filt_mm))) %*% t(colSums(filt_mm)))^(1/2)
diag(d) <- 0
d <- drop0(d)
pheatmap::pheatmap(d, clustering_method = "ward.D2")

# Merging similar clusters clusters
hist(d@x, breaks = 50)
cutoff <- 1.2 * (sum((d@x^2))/(length(d@x) - 1))^(1/2)
abline(v = (cutoff), col = "red", xpd = F)

cutoff
## [1] 2.536375
to_merge <- drop0((d > cutoff) * 1)
to_merge <- to_merge * lower.tri(to_merge)
diag(to_merge)[rowSums(to_merge) == 0] <- 1
# Plot cluster mappings
pheatmap::pheatmap(to_merge, cluster_rows = F, cluster_cols = F)

# Merge the cluster labels
mappings <- cbind(from = colnames(to_merge)[to_merge@i + 1], to = colnames(to_merge)[rep(x = 1:(length(to_merge@p) -
    1), times = (to_merge@p[-1] - to_merge@p[-length(to_merge@p)]))])
merged_filt_clustering <- factor(mappings[match(filt_clustering, mappings[, 1]),
    2])
merged_filt_mm <- sparse.model.matrix(~0 + factor(merged_filt_clustering))
colnames(merged_filt_mm) <- levels(factor(merged_filt_clustering))
merged_filt_new_centroids2d <- as.matrix(t(t(filt_new_UMAP2) %*% merged_filt_mm)/Matrix::colSums(merged_filt_mm))



# Plot the new clusterings
mypar()
plot(filt_new_UMAP2, type = "n")
draw_graph(layout = filt_new_UMAP2, graph = filt_KNN)
points(filt_new_UMAP2, col = pal[merged_filt_clustering], cex = 0.5, pch = 16)
text(merged_filt_new_centroids2d, labels = rownames(merged_filt_new_centroids2d),
    cex = 1, font = 1)

We can now compute the lineages on these filtered and re-grouped data. Note that the cluster containing our “Cd34” marker also have changed.

# Define lineage ends
ENDS <- c("17","27","25","16","26","53","49")

set.seed(1)
lineages <- getLineages(data = filt_new_UMAP2,
                        clusterLabels = merged_filt_clustering,
                        end.clus = ENDS,       # You can also define the ENDS!
                        start.clus = "34")     # efine where to START the trajectories


# IF NEEDED, ONE CAN ALSO MANULALLY EDIT THE LINEAGES, FOR EXAMPLE:
# sel <- sapply( lineages@lineages, function(x){rev(x)[1]} ) %in% ENDS 
# lineages@lineages <- lineages@lineages[ sel ]
# names(lineages@lineages) <- paste0("Lineage",1:length(lineages@lineages))
# lineages


# Change the reduction to our "fixed" UMAP2d (FOR VISUALISATION ONLY)
lineages@reducedDim <- filt_new_UMAP2

mypar() ; plot(filt_new_UMAP2, col = pal[merged_filt_clustering],  cex=.5,pch = 16)
lines(lineages, lwd = 1, col = 'black', cex=2 )
text(merged_filt_new_centroids2d, labels = rownames(merged_filt_new_centroids2d),cex=0.8,font=2,col = "white")

Much better!

Defining Principal Curves

Once the clusters are connected, Slingshot allows you to transform them to a smooth trajectory using principal curves. This is an algorithm that iteratively changes an initial curve to better match the data points. It was developed for linear data. To apply it to single-cell data, slingshot adds two enhancements:

  • It will run principal curves for each ‘lineage’, which is a set of clusters that go from a defined start cluster to some end cluster
  • Lineages with a same set of clusters will be constrained so that their principal curves remain bundled around the overlapping clusters

Since the function getCurves() takes some time to run, we can speed up the convergence of the curve fitting process by reducing the amount of cells to use in each lineage. Ideally you could all cells, but here we had set approx_points to 300 to speed up. Feel free to adjust that for your dataset.

# Define curves
curves <- getCurves(lineages, thresh = 0.1, stretch = 0.1, allow.breaks = F, approx_points = 100)
curves
## class: SlingshotDataSet
##
##  Samples Dimensions
##     5828          2
##
## lineages: 7
## Lineage1: 34  18  36  33  55  59  44  60  58  29  8  43  47  49
## Lineage2: 34  18  11  35  15  46  9  1  2  5  13  28  17
## Lineage3: 34  18  11  35  7  32  6  54  25
## Lineage4: 34  18  11  35  7  32  6  27
## Lineage5: 34  18  36  21  12  20  16
## Lineage6: 34  18  36  33  55  38  53
## Lineage7: 34  18  36  26
##
## curves: 7
## Curve1: Length: 6.7209   Samples: 2148.23
## Curve2: Length: 7.3478   Samples: 2108.92
## Curve3: Length: 3.463    Samples: 1285.44
## Curve4: Length: 2.546    Samples: 1203.13
## Curve5: Length: 2.9266   Samples: 971.56
## Curve6: Length: 2.8915   Samples: 1071.83
## Curve7: Length: 2.1368   Samples: 637.54
# Plots
plot(filt_new_UMAP2, col = pal[merged_filt_clustering], pch = 16)
lines(curves, lwd = 2, col = "black")
text(merged_filt_new_centroids2d, labels = rownames(merged_filt_new_centroids2d),
    cex = 1, font = 2)

With those results in hands, we can now compute the differentiation pseudotime.

pseudotime <- slingPseudotime(curves, na = FALSE)
cellWeights <- slingCurveWeights(curves)

x <- rowMeans(pseudotime)
x <- x/max(x)
o <- order(x)
mypar()
plot(filt_new_UMAP2[o, ], main = paste0("pseudotime"), pch = 16, cex = 0.4, axes = F,
    xlab = "", ylab = "", col = colorRampPalette(c("grey70", "orange3", "firebrick",
        "purple4"))(99)[x[o] * 98 + 1])
points(merged_filt_new_centroids2d, cex = 2.5, pch = 16, col = "#FFFFFF99")
text(merged_filt_new_centroids2d, labels = rownames(merged_filt_new_centroids2d),
    cex = 1, font = 2)

IMPORTANT: The pseudotime represents the distance of every cell to the starting cluster!

Generating batch-corrected data for differential gene expression


Before computing differential gene expression, it is a good idea to make sure our dataset is somewhat homogeneous (without very strong batch effects). In this dataset, we actually used data from 4 different technologies (Drop-seq, SmartSeq2 and 10X) and therefore massive differences in read counts can be observed:

# SEURAT
VlnPlot(obj, features = "nUMI", group.by = "batches")

# SCRAN
plotColData(sce, y = "nUMI", x = "batches", colour_by = "batches")

Since we are not interested in the effects of the batches in this example, but only the differentiation paths for each cell type. We can use the integrated space of harmony embeddings (where we removed batch effects). Since the harmony (same applies to MNN, scanorama, LIGER ) is a coorected verion of PCA, we can multiply the harmony embbedings with PCA loadings to generate batch-corrected “pseudocounts”. Note that we can only reconstruct data from the highly variable genes that were used to compute PCA and HARMONY.

# Get the gene means and standar deviation
library(sparseMatrixStats)
genes <- rownames(PCA_loadings)
gene_means <- rowMeans2(filt_NORM_COUNTS[genes, ])
gene_sd <- sqrt(rowVars(filt_NORM_COUNTS[genes, ]))

# Project normalized gene counts
CORRECTED_NORMCOUNTS <- t(filt_HARMONY %*% t(PCA_loadings)) * gene_sd + gene_means -
    0.02
CORRECTED_NORMCOUNTS <- Matrix(round(CORRECTED_NORMCOUNTS, 3), sparse = T)
CORRECTED_NORMCOUNTS@x[CORRECTED_NORMCOUNTS@x < 0] <- 0
CORRECTED_NORMCOUNTS <- drop0(CORRECTED_NORMCOUNTS)

# Transform the normalized data back to raw counts (used for differential
# expression)
CORRECTED_COUNTS <- round((expm1(CORRECTED_NORMCOUNTS)) * 1000)

Let’s compare how the normalized data compares to the batch-correceted one.

mypar(3, 3)

plot(filt_new_UMAP2, type = "n")
draw_graph(layout = filt_new_UMAP2, graph = filt_KNN)
points(filt_new_UMAP2, col = pal[filt_clustering], pch = 16)
text(merged_filt_new_centroids2d[, 1], merged_filt_new_centroids2d[, 2], labels = rownames(merged_filt_new_centroids2d),
    cex = 0.8, font = 2)

vars <- c("Cd34", "Ms4a1", "Cd3e", "Ltf", "Cst3", "Mcpt8", "Alas2", "Siglech")
for (i in vars) {
    plot(filt_NORM_COUNTS[i, ], CORRECTED_NORMCOUNTS[i, ], main = i, pch = 16, cex = 0.4)
    rr <- c(diff(range(filt_NORM_COUNTS[i, ]))/50, (range(CORRECTED_NORMCOUNTS[i,
        ])))
    polygon(c(-rr[1], -rr[1], rr[1], rr[1]), c(rr[3], rr[2], rr[2], rr[3]), border = "red")
    text(rr[1], max(CORRECTED_NORMCOUNTS[i, ]), " < Imputed\n    counts", adj = c(0,
        1), col = "red", font = 2)
}

IMPORTANT: Please note in the graphs above that there is a significan amount of imputation (i.e., we artificially add counts to certain cells where we’d expect to see ). Please keep this in mind and use these matrices with caution in downstream analysis!

Let’s also take a closer inspection on the UMAPs:

mypar(4, 5, mar = c(0.1, 0.1, 2, 1))

vars <- c("Cd34", "Ms4a1", "Cd3e", "Ltf", "Cst3", "Mcpt8", "Alas2", "Siglech", "C1qc")

for (j in c("filt_NORM_COUNTS", "CORRECTED_NORMCOUNTS")) {
    plot(filt_new_UMAP2, type = "n", axes = F, xlab = "", ylab = "", main = j)
    draw_graph(layout = filt_new_UMAP2, graph = filt_KNN)
    points(filt_new_UMAP2, col = pal[merged_filt_clustering], pch = 16)
    text(merged_filt_new_centroids2d, labels = rownames(merged_filt_new_centroids2d),
        cex = 0.8, font = 2)
    for (i in vars) {
        x <- get(j)[i, ]
        x <- x - min(x)/(max(x) - min(x))
        o <- order(x)
        plot(filt_new_UMAP2[o, ], main = paste0(i), pch = 16, cex = 0.4, axes = F,
            xlab = "", ylab = "", col = colorRampPalette(c("lightgray", "blue"))(99)[x[o] *
                98 + 1])
    }
}

Finding differentially expressed genes


The main way to interpret a trajectory is to find genes that change along the trajectory. There are many ways to define differential expression along a trajectory:

  • Expression changes along a particular path (i.e. change with pseudotime)
  • Expression differences between branches
  • Expression changes at branch points
  • Expression changes somewhere along the trajectory

tradeSeq is a recently proposed algorithm to find trajectory differentially expressed genes. It works by smoothing the gene expression along the trajectory by fitting a smoother using generalized additive models (GAMs), and testing whether certain coefficients are statstically different between points in the trajectory.

BiocParallel::register(BiocParallel::MulticoreParam())

The fitting of GAMs can take quite a while, so for demonstration purposes we first do a very stringent filtering of the genes.

IMPORTANT: In an ideal experiment, you would use all the genes, or at least those defined as being variable.

sel_cells <- split(colnames(CORRECTED_COUNTS), merged_filt_clustering)
sel_cells <- unlist(lapply(sel_cells, function(x) {
    set.seed(1)
    return(sample(x, 20))
}))

gv <- as.data.frame(na.omit(modelGeneVar(CORRECTED_NORMCOUNTS[, sel_cells])))
gv <- gv[order(gv$bio, decreasing = T), ]
sel_genes <- sort(rownames(gv)[1:500])

Fitting the model:

sceGAM <- tradeSeq::fitGAM(counts = drop0(CORRECTED_COUNTS[sel_genes, sel_cells]),
    pseudotime = pseudotime[sel_cells, ], cellWeights = cellWeights[sel_cells, ],
    nknots = 10, verbose = T, parallel = T, BPPARAM = BiocParallel::MulticoreParam())
##
  |
  |                                                                      |   0%
  |
  |==                                                                    |   2%
  |
  |====                                                                  |   5%
  |
  |=====                                                                 |   8%
  |
  |=======                                                               |  10%
  |
  |=========                                                             |  12%
  |
  |==========                                                            |  15%
  |
  |============                                                          |  18%
  |
  |==============                                                        |  20%
  |
  |================                                                      |  22%
  |
  |==================                                                    |  25%
  |
  |===================                                                   |  28%
  |
  |=====================                                                 |  30%
  |
  |=======================                                               |  32%
  |
  |========================                                              |  35%
  |
  |==========================                                            |  38%
  |
  |============================                                          |  40%
  |
  |==============================                                        |  42%
  |
  |================================                                      |  45%
  |
  |=================================                                     |  48%
  |
  |===================================                                   |  50%
  |
  |=====================================                                 |  52%
  |
  |======================================                                |  55%
  |
  |========================================                              |  58%
  |
  |==========================================                            |  60%
  |
  |============================================                          |  62%
  |
  |==============================================                        |  65%
  |
  |===============================================                       |  68%
  |
  |=================================================                     |  70%
  |
  |===================================================                   |  72%
  |
  |====================================================                  |  75%
  |
  |======================================================                |  78%
  |
  |========================================================              |  80%
  |
  |==========================================================            |  82%
  |
  |============================================================          |  85%
  |
  |=============================================================         |  88%
  |
  |===============================================================       |  90%
  |
  |=================================================================     |  92%
  |
  |==================================================================    |  95%
  |
  |====================================================================  |  98%
  |
  |======================================================================| 100%
plotGeneCount(curves, clusters = merged_filt_clustering, models = sceGAM)

lineages
## class: SlingshotDataSet
##
##  Samples Dimensions
##     5828          2
##
## lineages: 7
## Lineage1: 34  18  36  33  55  59  44  60  58  29  8  43  47  49
## Lineage2: 34  18  11  35  15  46  9  1  2  5  13  28  17
## Lineage3: 34  18  11  35  7  32  6  54  25
## Lineage4: 34  18  11  35  7  32  6  27
## Lineage5: 34  18  36  21  12  20  16
## Lineage6: 34  18  36  33  55  38  53
## Lineage7: 34  18  36  26
##
## curves: 0
lc <- sapply(lineages@lineages, function(x) {
    rev(x)[1]
})
names(lc) <- gsub("Lineage", "L", names(lc))

mypar()
plot(filt_new_UMAP2, col = pal[merged_filt_clustering], pch = 16)
lines(curves, lwd = 2, col = "black")
points(merged_filt_new_centroids2d[lc, ], col = "black", pch = 16, cex = 4)
text(merged_filt_new_centroids2d[lc, ], labels = names(lc), cex = 1, font = 2, col = "white")

Genes that change with pseudotime

We can first look at general trends of gene expression across pseudotime.

res <- na.omit(associationTest(sceGAM))
res <- res[res$pvalue < 0.001, ]
res <- res[res$waldStat > mean(res$waldStat), ]
res <- res[order(res$waldStat, decreasing = T), ]
res[1:10, ]

We can plot their expression:

mypar(4, 4, mar = c(0.1, 0.1, 2, 1))

plot(filt_new_UMAP2, col = pal[merged_filt_clustering], cex = 0.5, pch = 16, axes = F,
    xlab = "", ylab = "")
lines(curves, lwd = 2, col = "black")
points(merged_filt_new_centroids2d[lc, ], col = "black", pch = 15, cex = 3, xpd = T)
text(merged_filt_new_centroids2d[lc, ], labels = names(lc), cex = 1, font = 2, col = "white",
    xpd = T)

vars <- rownames(res[1:15, ])

for (i in vars) {
    x <- CORRECTED_NORMCOUNTS[i, ]
    x <- (x - min(x))/(max(x) - min(x))
    o <- order(x)
    plot(filt_new_UMAP2[o, ], main = paste0(i), pch = 16, cex = 0.5, axes = F, xlab = "",
        ylab = "", col = colorRampPalette(c("lightgray", "grey60", "navy"))(99)[x[o] *
            98 + 1])
}

Genes that change between two pseudotime points

We can define custom pseudotime values of interest if we’re interested in genes that change between particular point in pseudotime. By default, we can look at differences between start and end:

res <- na.omit(startVsEndTest(sceGAM, pseudotimeValues = c(0, 1)))
res <- res[res$pvalue < 0.001, ]
res <- res[res$waldStat > mean(res$waldStat), ]
res <- res[order(res$waldStat, decreasing = T), ]
res[1:10, 1:6]

You can see now that there are several more columns, one for each lineage. This table represents the differential expression within each lineage, to identify which genes go up or down. Let’s check lineage 1:

# Get the top UP and Down regulated in lineage 1
res_lin1 <- sort(setNames(res$logFClineage1, rownames(res)))
vars <- names(c(rev(res_lin1)[1:7], res_lin1[1:8]))

mypar(4, 4, mar = c(0.1, 0.1, 2, 1))

plot(filt_new_UMAP2, col = pal[merged_filt_clustering], cex = 0.5, pch = 16, axes = F,
    xlab = "", ylab = "")
lines(curves, lwd = 2, col = "black")
points(merged_filt_new_centroids2d[lc, ], col = "black", pch = 15, cex = 3, xpd = T)
text(merged_filt_new_centroids2d[lc, ], labels = names(lc), cex = 1, font = 2, col = "white",
    xpd = T)

for (i in vars) {
    x <- CORRECTED_NORMCOUNTS[i, ]
    x <- (x - min(x))/(max(x) - min(x))
    o <- order(x)
    plot(filt_new_UMAP2[o, ], main = paste0(i), pch = 16, cex = 0.5, axes = F, xlab = "",
        ylab = "", col = colorRampPalette(c("lightgray", "grey60", "navy"))(99)[x[o] *
            98 + 1])
}

Genes that are different between lineages

More interesting are genes that are different between two branches. We may have seen some of these genes already pop up in previous analyses of pseudotime. There are several ways to define “different between branches”, and each have their own functions:

  • Different at the end points, using diffEndTest
  • Different at the branching point, using earlyDETest
  • Different somewhere in pseudotime the branching point, using patternTest
  • Note that the last function requires that the pseudotimes between two lineages are aligned.
res <- na.omit(diffEndTest(sceGAM))
res <- res[res$pvalue < 0.001, ]
res <- res[res$waldStat > mean(res$waldStat), ]
res <- res[order(res$waldStat, decreasing = T), ]
res[1:10, 1:6]

You can see now that there are even more columns, one for the pair-wise comparisson between each lineage. Let’s check lineage 1 vs lineage 2:

# Get the top UP and Down regulated in lineage 1 vs 2
res_lin1_2 <- sort(setNames(res$logFC1_2, rownames(res)))
vars <- names(c(rev(res_lin1_2)[1:7], res_lin1_2[1:8]))

mypar(4, 4, mar = c(0.1, 0.1, 2, 1))

plot(filt_new_UMAP2, col = pal[merged_filt_clustering], cex = 0.5, pch = 16, axes = F,
    xlab = "", ylab = "")
lines(curves, lwd = 2, col = "black")
points(merged_filt_new_centroids2d[lc, ], col = "black", pch = 15, cex = 3, xpd = T)
text(merged_filt_new_centroids2d[lc, ], labels = names(lc), cex = 1, font = 2, col = "white",
    xpd = T)

for (i in vars) {
    x <- CORRECTED_NORMCOUNTS[i, ]
    x <- (x - min(x))/(max(x) - min(x))
    o <- order(x)
    plot(filt_new_UMAP2[o, ], main = paste0(i), pch = 16, cex = 0.5, axes = F, xlab = "",
        ylab = "", col = colorRampPalette(c("lightgray", "grey60", "navy"))(99)[x[o] *
            98 + 1])
}

Check out this vignette for a more in-depth overview of tradeSeq and many other differential expression tests.

References


Cannoodt, Robrecht, Wouter Saelens, and Yvan Saeys. 2016. “Computational Methods for Trajectory Inference from Single-Cell Transcriptomics.” European Journal of Immunology 46 (11): 2496–2506. doi.

Saelens, Wouter, Robrecht Cannoodt, Helena Todorov, and Yvan Saeys. 2019. “A Comparison of Single-Cell Trajectory Inference Methods.” Nature Biotechnology 37 (5): 547–54. doi.

Additional comments

Loading the dataset with SEURAT

You can recreate your object in Seurat like so:

# # Create object with counts and metadata obj <- CreateSeuratObject( counts =
# Read10X_h5('./data/subset_BM_counts.h5') , meta.data =
# read.csv2('./data/subset_BM_metadata.csv',row.names = 1)) # Add normalized
# counts and variable features obj@assays$RNA@data <-
# Read10X_h5('./data/subset_BM_normdata.h5') obj@assays$RNA@var.features <-
# read.csv2('./data/subset_BM_var_features.csv',row.names = 1) # Add reductions
# and PCA loadings for(i in
# c('pca','harmony','harmony_Phase','umap','umap3d')){ obj@reductions[[i]] <-
# CreateDimReducObject( embeddings =
# as.matrix(Read10X_h5(paste0('./data/subset_BM_',i,'.h5'))) ,key =
# paste0(i,'_'))} obj@reductions$pca@feature.loadings <-
# as.matrix(Read10X_h5(paste0('./data/subset_BM_pca_feature_loadings.h5'))) #
# Add KNN graph obj@graphs[['knn']] <-
# Read10X_h5(paste0('./data/subset_BM_knn.h5')) obj
# saveRDS(obj,'trajectory_seurat.rds')

You can recreate your object in Scran like so:

# sce <- SingleCellExperiment( assays=list(
# counts=Read10X_h5('./data/subset_BM_counts.h5'),
# logcounts=Read10X_h5('./data/subset_BM_normdata.h5')),
# reducedDims=SimpleList(
# pca=as.matrix(Read10X_h5(paste0('./data/subset_BM_pca.h5'))),
# harmony=as.matrix(Read10X_h5(paste0('./data/subset_BM_harmony.h5'))),
# harmony_Phase=as.matrix(Read10X_h5(paste0('./data/subset_BM_harmony_Phase.h5'))),
# umap=as.matrix(Read10X_h5(paste0('./data/subset_BM_umap.h5'))),
# umap3d=as.matrix(Read10X_h5(paste0('./data/subset_BM_umap3d.h5'))),
# knn=Read10X_h5(paste0('./data/subset_BM_knn.h5'))) ) sce@colData <-
# DataFrame(read.csv2('./data/subset_BM_metadata.csv',row.names = 1)) attr(x =
# reducedDim(sce,'pca'),which = 'rotation') <-
# as.matrix(Read10X_h5(paste0('./data/subset_BM_pca_feature_loadings.h5')))
# saveRDS(sce,'trajectory_scran.rds')